SeqApiPop - 629 échantillons - MAF > 0.01
LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp)
# LD03 - CV error plot: load and merge the 30 numbered .cv.error files
setwd("~/Documents/Stage_NB/data/maf001_LD03")
# File number 1 read on its own; not used again in the visible part of the script.
cv_error <- read.table("SeqApiPop_629_maf001_LD03_1.cv.error", header = FALSE)
# Steps 1-2: build the 30 file names and read each file into a list
# (one data frame per file; read.table names the columns V1, V2, ...).
cv_files <- paste0("SeqApiPop_629_maf001_LD03_", 1:30, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Row with the smallest value in column 2
point_min <- merge_cv_error[which.min(merge_cv_error[, 2]), ]
# CV error boxplot: one box per value of column V1 (x axis "K"),
# built from the values of column V2 (y axis "CV").
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.005 across the displayed range.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.475, 0.480, 0.485, 0.490, 0.495, 0.500, 0.505),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV") +
  # y labels rounded to 0.001
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  # Black frame around the panel, no grid lines
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Restrict the visible y range without removing data from the statistics
  coord_cartesian(ylim = c(0.475, 0.505))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Components shared by the two jitter plots below.
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid.
# `linewidth` replaces `size`, deprecated since ggplot2 3.4.0.
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)
# Boxplot with hidden outliers (every point is drawn by geom_jitter) + red points
cv_points <- list(
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA),
  geom_jitter(width = 0.2, alpha = 0.7, color = "red")
)

# jitter plot, all values of K
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.475, 0.480, 0.485, 0.490, 0.495, 0.500, 0.505),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  cv_points +
  cv_style +
  coord_cartesian(ylim = c(0.475, 0.505))

# Keep only the rows with K = 3, 4, ..., 12
filtered_data <- merge_cv_error[merge_cv_error[, 1] %in% 3:12, ]
# jitter plot on the filtered data
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.475, 0.480, 0.485, 0.490, 0.495, 0.500),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  cv_points +
  cv_style +
  coord_cartesian(ylim = c(0.475, 0.5))

LD pruning = 0.2 (fenêtre de 1749 SNPs et pas de 175 bp)
##### LD02: load and merge the 10 numbered .cv.error files
setwd("~/Documents/Stage_NB/data/maf001_LD02")
# File number 1 read on its own; not used again in the visible part of the script.
cv_error <- read.table("SeqApiPop_629_maf001_LD02_1.cv.error", header = FALSE)
# Steps 1-2: build the 10 file names and read each file into a list
cv_files <- paste0("SeqApiPop_629_maf001_LD02_", 1:10, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Row with the smallest value in column 2
point_min <- merge_cv_error[which.min(merge_cv_error[, 2]), ]
# Components shared by both LD02 plots.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(0.430, 0.435, 0.440, 0.445, 0.450),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# boxplot LD02
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  cv_style +
  coord_cartesian(ylim = c(0.430, 0.450))

# jitter plot LD02: outliers hidden in the boxplot, every point drawn by geom_jitter
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  cv_style +
  coord_cartesian(ylim = c(0.430, 0.450))

LD pruning = 0.1 (fenêtre de 1749 SNPs et pas de 175 bp)
##### LD01: load and merge the 10 numbered .cv.error files
setwd("~/Documents/Stage_NB/data/maf001_LD01")
# File number 1 read on its own; not used again in the visible part of the script.
cv_error <- read.table("SeqApiPop_629_maf001_LD01_1.cv.error", header = FALSE)
# Steps 1-2: build the 10 file names and read each file into a list
cv_files <- paste0("SeqApiPop_629_maf001_LD01_", 1:10, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Row with the smallest value in column 2
point_min <- merge_cv_error[which.min(merge_cv_error[, 2]), ]
# Components shared by both LD01 plots.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(0.370, 0.375, 0.380, 0.385, 0.390),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# boxplot LD01
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  cv_style +
  coord_cartesian(ylim = c(0.370, 0.390))

# jitter plot LD01: outliers hidden in the boxplot, every point drawn by geom_jitter
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  cv_style +
  coord_cartesian(ylim = c(0.370, 0.390))

LD pruning = 0.05 (fenêtre de 1749 SNPs et pas de 175 bp)
##### LD005: load and merge the 10 numbered .cv.error files
setwd("~/Documents/Stage_NB/data/maf001_LD005")
# File number 1 read on its own; not used again in the visible part of the script.
cv_error <- read.table("SeqApiPop_629_maf001_LD005_1.cv.error", header = FALSE)
# Steps 1-2: build the 10 file names and read each file into a list
cv_files <- paste0("SeqApiPop_629_maf001_LD005_", 1:10, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Components shared by both LD005 plots.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(0.460, 0.465, 0.470, 0.475, 0.480),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# boxplot LD005
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  cv_style +
  coord_cartesian(ylim = c(0.460, 0.480))

# jitter plot LD005: outliers hidden in the boxplot, every point drawn by geom_jitter
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  cv_style +
  coord_cartesian(ylim = c(0.460, 0.480))

LD pruning = 0.04 (fenêtre de 1749 SNPs et pas de 175 bp)
##### LD004: load and merge the 10 numbered .cv.error files
setwd("~/Documents/Stage_NB/data/maf001_LD004")
# File number 1 read on its own; not used again in the visible part of the script.
cv_error <- read.table("SeqApiPop_629_maf001_LD004_1.cv.error", header = FALSE)
# Steps 1-2: build the 10 file names and read each file into a list
cv_files <- paste0("SeqApiPop_629_maf001_LD004_", 1:10, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Components shared by both LD004 plots.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(0.575, 0.58, 0.585, 0.59, 0.595, 0.6),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# CV error boxplot LD004
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  cv_style +
  coord_cartesian(ylim = c(0.575, 0.6))

# jitter plot LD004: outliers hidden in the boxplot, every point drawn by geom_jitter
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  cv_style +
  coord_cartesian(ylim = c(0.575, 0.6))

LD pruning = 0.03 (fenêtre de 1749 SNPs et pas de 175 bp)
##### LD003: load and merge the 10 numbered .cv.error files
setwd("~/Documents/Stage_NB/data/maf001_LD003")
# Steps 1-2: build the 10 file names and read each file into a list
cv_files <- paste0("SeqApiPop_629_maf001_LD003_", 1:10, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Components shared by both LD003 plots.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(0.78, 0.785, 0.79, 0.795, 0.8, 0.805, 0.81, 0.815),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# CV error boxplot LD003
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  cv_style +
  coord_cartesian(ylim = c(0.78, 0.815))

# jitter plot LD003: outliers hidden in the boxplot, every point drawn by geom_jitter
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  cv_style +
  coord_cartesian(ylim = c(0.78, 0.815))

LD pruning = 0.01 (fenêtre de 1749 SNPs et pas de 175 bp)
##### LD001: load and merge the 10 numbered .cv.error files
setwd("~/Documents/Stage_NB/data/maf001_LD001")
# Steps 1-2: build the 10 file names and read each file into a list
cv_files <- paste0("SeqApiPop_629_maf001_LD001_", 1:10, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Components shared by both LD001 plots.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(0.895, 0.9, 0.905, 0.91, 0.915, 0.92, 0.925, 0.93, 0.935, 0.94, 0.945),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# CV error boxplot LD001
# NOTE(review): unlike the other sections, outliers are hidden in this plain
# boxplot as well (outlier.shape = NA) - confirm this is intended.
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  cv_style +
  coord_cartesian(ylim = c(0.895, 0.945))

# jitter plot LD001: outliers hidden in the boxplot, every point drawn by geom_jitter
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  cv_style +
  coord_cartesian(ylim = c(0.895, 0.945))

SeqApiPop - 561 échantillons - MAF > 0.01
LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp)
# LD03 - CV error plot (561 samples): load and merge the 30 numbered .cv.error files
setwd("~/Documents/Stage_NB/data/SeqApiPop_561_maf001_LD03")
# File number 1 read on its own; not used again in the visible part of the script.
cv_error <- read.table("SeqApiPop_561_maf001_LD03_1.cv.error", header = FALSE)
# Steps 1-2: build the 30 file names and read each file into a list
cv_files <- paste0("SeqApiPop_561_maf001_LD03_", 1:30, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Row with the smallest value in column 2
point_min <- merge_cv_error[which.min(merge_cv_error[, 2]), ]
# Components shared by the three plots of this section.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(0.46, 0.465, 0.47, 0.475, 0.480, 0.485, 0.490),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)
# Boxplot with hidden outliers (every point is drawn by geom_jitter) + red points
cv_points <- list(
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA),
  geom_jitter(width = 0.2, alpha = 0.7, color = "red")
)

# CV error boxplot
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  cv_style +
  coord_cartesian(ylim = c(0.46, 0.49))

# jitter plot, all values of K
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  cv_points +
  cv_style +
  coord_cartesian(ylim = c(0.46, 0.49))

# Keep only the rows with K = 3, 4, ..., 12
filtered_data <- merge_cv_error[merge_cv_error[, 1] %in% 3:12, ]
# jitter plot on the filtered data (narrower y range, one guide line fewer)
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.46, 0.465, 0.47, 0.475, 0.480, 0.485),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  cv_points +
  cv_style +
  coord_cartesian(ylim = c(0.4625, 0.485))

SeqApiPop - 629 échantillons - SNPsBeeMuSe filtered
No LD pruning - 10030 SNPs
# Load and merge the 30 numbered .cv.error files (629 samples, SNPsBeeMuSe filtered)
setwd("~/Documents/Stage_NB/data/SeqApiPop_629_SNPsBeeMuSe")
# Steps 1-2: build the 30 file names and read each file into a list
cv_files <- paste0("SeqApiPop_629_SNPsBeeMuSe_filtered_", 1:30, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Components shared by the three plots of this section.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(
    0.72, 0.725, 0.73, 0.735, 0.74, 0.745, 0.75, 0.755, 0.76,
    0.765, 0.77, 0.775, 0.78, 0.785, 0.79, 0.795, 0.8
  ),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)
# Boxplot with hidden outliers (every point is drawn by geom_jitter) + red points
cv_points <- list(
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA),
  geom_jitter(width = 0.2, alpha = 0.7, color = "red")
)

# box plot filtered
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  cv_style +
  coord_cartesian(ylim = c(0.72, 0.8))

# jitter plot filtered
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  cv_points +
  cv_style +
  coord_cartesian(ylim = c(0.72, 0.8))

# Keep only the rows with K = 3, 4, ..., 12
filtered_data <- merge_cv_error[merge_cv_error[, 1] %in% 3:12, ]
# jitter plot on the K-filtered data (narrower y range)
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.72, 0.725, 0.73, 0.735, 0.74, 0.745),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  cv_points +
  cv_style +
  coord_cartesian(ylim = c(0.72, 0.745))

MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175
bp) - 3848 SNPs
# Load and merge the 30 numbered .cv.error files
# (629 samples, SNPsBeeMuSe filtered, maf001_LD03)
setwd("~/Documents/Stage_NB/data/SeqApiPop_629_SNPsBeeMuSe")
# Steps 1-2: build the 30 file names and read each file into a list
cv_files <- paste0("SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_", 1:30, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Components shared by both plots of this section.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(
    0.765, 0.77, 0.775, 0.78, 0.785, 0.79, 0.795, 0.8,
    0.805, 0.81, 0.815, 0.82, 0.825, 0.83, 0.835, 0.84
  ),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)

# box plot LD03 filtered
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  cv_style +
  coord_cartesian(ylim = c(0.765, 0.84))

# jitter plot LD03 filtered: outliers hidden, every point drawn by geom_jitter
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.7, color = "red") +
  cv_style +
  coord_cartesian(ylim = c(0.765, 0.84))

MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10
bp) - 1055 SNPs
# Load and merge the 30 numbered .cv.error files
# (629 samples, SNPsBeeMuSe filtered, maf001_LD03_default)
# NOTE(review): the section heading mentions LD pruning = 0.1 while the file
# names contain "LD03_default" - confirm which threshold these runs used.
setwd("~/Documents/Stage_NB/data/SeqApiPop_629_SNPsBeeMuSe")
# Steps 1-2: build the 30 file names and read each file into a list
cv_files <- paste0("SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_default_", 1:30, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Components shared by the three plots of this section.
# Guide lines every 0.005; `linewidth` replaces `size` (deprecated since ggplot2 3.4.0).
cv_guides <- geom_hline(
  yintercept = c(
    0.755, 0.76, 0.765, 0.77, 0.775, 0.78, 0.785,
    0.79, 0.795, 0.8, 0.805, 0.81, 0.815, 0.82
  ),
  color = "black",
  linetype = "solid",
  linewidth = 0.5
)
# Labels, y labels rounded to 0.001, minimal theme with a black frame and no grid
cv_style <- list(
  labs(title = "Cross-validation Error Plot", x = "K", y = "CV"),
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)),
  theme_minimal(),
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
)
# Boxplot with hidden outliers (every point is drawn by geom_jitter) + red points
cv_points <- list(
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA),
  geom_jitter(width = 0.2, alpha = 0.7, color = "red")
)

# box plot filtered
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  cv_style +
  coord_cartesian(ylim = c(0.755, 0.82))

# jitter plot filtered
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  cv_guides +
  cv_points +
  cv_style +
  coord_cartesian(ylim = c(0.755, 0.82))

# Keep only the rows with K = 3, 4, ..., 12
filtered_data <- merge_cv_error[merge_cv_error[, 1] %in% 3:12, ]
# jitter plot on the K-filtered data (narrower y range)
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  geom_hline(
    yintercept = c(0.755, 0.76, 0.765, 0.77, 0.775, 0.78, 0.785),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  cv_points +
  cv_style +
  coord_cartesian(ylim = c(0.755, 0.785))

SeqApiPop - 561 échantillons - SNPsBeeMuSe filtered
No LD pruning - 10030 SNPs
# Load and merge the 30 numbered .cv.error files (561 samples, SNPsBeeMuSe filtered)
setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuSe")
# Steps 1-2: build the 30 file names and read each file into a list
cv_files <- paste0("SeqApiPop_561_SNPsBeeMuSe_filtered_", 1:30, ".cv.error")
liste_de_donnees <- lapply(cv_files, read.table, header = FALSE)
# Step 3: stack all files into a single data frame
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table (tab-separated, no column or row names),
# then read it back as the data frame used by the plots.
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
#box plot filtered
ggplot(merge_cv_error, aes(x = factor(merge_cv_error[,1]), y = merge_cv_error[,2])) +
geom_hline(
yintercept = c(0.755,0.76,0.765,0.77,0.775,0.78,0.785,0.79,0.795,0.8,0.805,0.81,0.815,0.82,0.825,0.83,0.835,0.84),
color = "black",
linetype = "solid",
size = 0.5
) +
geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
labs(title = "Cross-validation Error Plot",
x = "K",
y = "CV") +
scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
theme_minimal() +
theme(
panel.border = element_rect(color = "black", fill = NA, size = 1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
) +
coord_cartesian(ylim = c(0.755, 0.84))

# Box plot with the individual runs overlaid as jittered points: column V1 on
# the x-axis (labelled K), column V2 on the y-axis (labelled CV).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.005; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.755, 0.76, 0.765, 0.77, 0.775, 0.78, 0.785, 0.79, 0.795,
                   0.8, 0.805, 0.81, 0.815, 0.82, 0.825, 0.83, 0.835, 0.84),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  # Outliers are hidden here because every observation is drawn by the
  # jitter layer anyway.
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  # height = 0: spread the points horizontally only, so each point keeps its
  # exact CV value (geom_jitter also adds vertical noise by default).
  geom_jitter(width = 0.2, height = 0, alpha = 0.7, color = "red") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.755, 0.84))

# Keep only the rows whose first column (K) is between 3 and 12.
filtered_data <- merge_cv_error[merge_cv_error[, 1] %in% 3:12, ]
# Box plot plus jittered points for the filtered rows.
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.005; those below 0.758 fall outside the
  # window set by coord_cartesian() at the end of the chain.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.72, 0.725, 0.73, 0.735, 0.74, 0.745, 0.75, 0.755, 0.76,
                   0.765, 0.77, 0.775),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  # Outliers are hidden here because every observation is drawn by the
  # jitter layer anyway.
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  # height = 0: spread the points horizontally only, so each point keeps its
  # exact CV value (geom_jitter also adds vertical noise by default).
  geom_jitter(width = 0.2, height = 0, alpha = 0.7, color = "red") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.758, 0.775))

MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp) - 3848 SNPs
# Directory holding the per-run .cv.error files for this data set.
setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuse_LD03")
# Build the 30 per-run file names and read each file into a list. The file
# names live in their own variable so that `merge_cv_error` only ever refers
# to the merged data frame.
fichiers_cv_error <- paste0("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_pruned_", 1:30, ".cv.error")
liste_de_donnees <- lapply(fichiers_cv_error, read.table, header = FALSE)
# Stack the per-run tables into a single data frame.
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Save the merged table without row or column names, then read it back; with
# header = FALSE the columns are named V1, V2, ...
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Box plot of the merged CV errors: column V1 on the x-axis (labelled K),
# column V2 on the y-axis (labelled CV). V1/V2 are the default names given by
# read.table(header = FALSE).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.005; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.775, 0.78, 0.785, 0.79, 0.795, 0.8, 0.805, 0.81, 0.815,
                   0.82, 0.825, 0.83, 0.835, 0.84, 0.845, 0.85),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.775, 0.85))

# Box plot with the individual runs overlaid as jittered points: column V1 on
# the x-axis (labelled K), column V2 on the y-axis (labelled CV).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.005; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.775, 0.78, 0.785, 0.79, 0.795, 0.8, 0.805, 0.81, 0.815,
                   0.82, 0.825, 0.83, 0.835, 0.84, 0.845, 0.85),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  # Outliers are hidden here because every observation is drawn by the
  # jitter layer anyway.
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  # height = 0: spread the points horizontally only, so each point keeps its
  # exact CV value (geom_jitter also adds vertical noise by default).
  geom_jitter(width = 0.2, height = 0, alpha = 0.7, color = "red") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.775, 0.85))

MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10 bp) - 1055 SNPs
# Directory holding the per-run .cv.error files for this data set.
setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuSe_LD_default")
# Steps 1-2: build the 30 per-run file names and read each file into a list.
# The file names live in their own variable so that `merge_cv_error` only
# ever refers to the merged data frame.
fichiers_cv_error <- paste0("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_default_", 1:30, ".cv.error")
liste_de_donnees <- lapply(fichiers_cv_error, read.table, header = FALSE)
# Step 3: stack the per-run tables into a single data frame.
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table without row or column names, then read it
# back; with header = FALSE the columns are named V1, V2, ...
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Box plot of the merged CV errors: column V1 on the x-axis (labelled K),
# column V2 on the y-axis (labelled CV). V1/V2 are the default names given by
# read.table(header = FALSE).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.005; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.765, 0.77, 0.775, 0.78, 0.785, 0.79, 0.795, 0.8, 0.805,
                   0.81, 0.815, 0.82, 0.825, 0.83),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.765, 0.83))

# Box plot with the individual runs overlaid as jittered points: column V1 on
# the x-axis (labelled K), column V2 on the y-axis (labelled CV).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.005; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.765, 0.77, 0.775, 0.78, 0.785, 0.79, 0.795, 0.8, 0.805,
                   0.81, 0.815, 0.82, 0.825, 0.83),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  # Outliers are hidden here because every observation is drawn by the
  # jitter layer anyway.
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  # height = 0: spread the points horizontally only, so each point keeps its
  # exact CV value (geom_jitter also adds vertical noise by default).
  geom_jitter(width = 0.2, height = 0, alpha = 0.7, color = "red") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.765, 0.83))

# K values to include in the plot.
k_values <- c(3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
# Keep only the rows whose first column (K) is one of the requested values.
filtered_data <- merge_cv_error[merge_cv_error[, 1] %in% k_values, ]
# Box plot plus jittered points for the filtered rows.
ggplot(filtered_data, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.005; the 0.765 line falls outside the
  # window set by coord_cartesian() at the end of the chain.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.765, 0.77, 0.775, 0.78, 0.785, 0.79),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  # Outliers are hidden here because every observation is drawn by the
  # jitter layer anyway.
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  # height = 0: spread the points horizontally only, so each point keeps its
  # exact CV value (geom_jitter also adds vertical noise by default).
  geom_jitter(width = 0.2, height = 0, alpha = 0.7, color = "red") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.767, 0.79))

BeeMuSe - 12000 SNPs
# K values of this series (2 to 70; 33 and 37 are not present in the list).
K_new <- c(
  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
  22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 38, 39, 40, 41,
  42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
  60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70
)
# CV error for each K above, paired by position.
CV_new <- c(
  0.48943, 0.46307, 0.44338, 0.42781, 0.41962, 0.40411, 0.40055, 0.38682,
  0.38044, 0.37802, 0.37102, 0.36619, 0.36130, 0.35863, 0.35431, 0.35162,
  0.34960, 0.34742, 0.34640, 0.34555, 0.34281, 0.34187, 0.34055, 0.34081,
  0.33861, 0.33930, 0.33562, 0.33588, 0.33543, 0.33518, 0.33262, 0.33176,
  0.33216, 0.33019, 0.33013, 0.32908, 0.33166, 0.32875, 0.32961, 0.32977,
  0.32796, 0.32878, 0.32828, 0.32772, 0.32533, 0.32954, 0.32862, 0.32522,
  0.32771, 0.32770, 0.32558, 0.33150, 0.33390, 0.32462, 0.32938, 0.32893,
  0.33254, 0.32727, 0.32883, 0.33015, 0.33254, 0.33260, 0.33633, 0.33744,
  0.33756, 0.33277, 0.33579
)
# Both vectors are typed by hand; fail early if they get out of step.
stopifnot(length(K_new) == length(CV_new))
# Index of the lowest CV value.
indice_min_new <- which.min(CV_new)
# Draw the curve. The light-gray horizontal grid (every 0.01) is passed as
# panel.first so it is painted before the curve and stays in the background.
# NOTE(review): the title says "3848 SNPs" while the section heading above
# reads "BeeMuSe - 12000 SNPs" - confirm which one is correct.
plot(
  K_new, CV_new,
  type = "l", col = "black", xlab = "K", ylab = "CV",
  main = "CV error plot - BeeMuSe 3848 SNPs",
  panel.first = abline(h = seq(0, 1, by = 0.01), col = "lightgray")
)
# Dashed blue horizontal line at the lowest CV value.
abline(h = CV_new[indice_min_new], col = "blue", lty = 2)
# Dashed red vertical line at the K giving the lowest CV value.
abline(v = K_new[indice_min_new], col = "red", lty = 2)
# Legend reporting the lowest CV error and the corresponding K.
legend(
  "topright",
  legend = sprintf("lowest CV error: %.5f (K = %d)", CV_new[indice_min_new], K_new[indice_min_new]),
  col = "blue", lty = 2, cex = 0.8
)

Merged Data - BeeMuSe SeqApiPop
MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp)
# CV values - unsupervised Admixture. K runs from 2 to 50.
K <- c(
  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
  22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
  40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50
)
# CV error for each K above, paired by position.
CV <- c(
  0.66344, 0.62032, 0.60790, 0.60015, 0.58943, 0.58458, 0.58509, 0.57812,
  0.57116, 0.56933, 0.56023, 0.56145, 0.55645, 0.55416, 0.55222, 0.54907,
  0.54576, 0.54370, 0.54270, 0.54220, 0.54162, 0.54375, 0.54147, 0.53958,
  0.54064, 0.53829, 0.54184, 0.53940, 0.53965, 0.54062, 0.53549, 0.54011,
  0.53806, 0.53622, 0.53630, 0.53986, 0.54062, 0.53707, 0.54252, 0.53593,
  0.53901, 0.54513, 0.54334, 0.54278, 0.54128, 0.54482, 0.54184, 0.55066,
  0.55151
)
# Both vectors are typed by hand; fail early if they get out of step.
stopifnot(length(K) == length(CV))
# Index of the lowest CV value.
indice_min <- which.min(CV)
# Draw the curve. The light-gray horizontal grid (every 0.01) is passed as
# panel.first so it is painted before the curve and stays in the background.
plot(
  K, CV,
  type = "l", col = "black", xlab = "K", ylab = "CV",
  main = "CV error plot - Merged BeeMuSe SeqApiPop 3848 SNPs",
  panel.first = abline(h = seq(0, 1, by = 0.01), col = "lightgray")
)
# Dashed blue horizontal line at the lowest CV value.
abline(h = CV[indice_min], col = "blue", lty = 2)
# Dashed red vertical line at the K giving the lowest CV value.
abline(v = K[indice_min], col = "red", lty = 2)
# Legend reporting the lowest CV error and the corresponding K.
legend(
  "topright",
  legend = sprintf("lowest CV error: %.5f (K = %d)", CV[indice_min], K[indice_min]),
  col = "blue", lty = 2, cex = 0.8
)

# New data: a second series of CV values for K from 2 to 50. This overwrites
# the K, CV and indice_min objects of the previous series.
K <- c(
  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
  22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
  40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50
)
# CV error for each K above, paired by position.
CV <- c(
  0.66350, 0.62020, 0.60778, 0.59670, 0.58968, 0.58467, 0.58258, 0.57804,
  0.57563, 0.56626, 0.56028, 0.55808, 0.55537, 0.55140, 0.55279, 0.54843,
  0.54649, 0.54583, 0.54412, 0.54363, 0.54187, 0.54172, 0.54028, 0.53822,
  0.54265, 0.53707, 0.53598, 0.53746, 0.54359, 0.53759, 0.53651, 0.53563,
  0.53614, 0.53701, 0.53993, 0.53787, 0.53929, 0.53656, 0.53982, 0.54181,
  0.54237, 0.53793, 0.54851, 0.54505, 0.54506, 0.54203, 0.54622, 0.55210,
  0.55177
)
# Both vectors are typed by hand; fail early if they get out of step.
stopifnot(length(K) == length(CV))
# Index of the lowest CV value.
indice_min <- which.min(CV)
# Draw the curve. The light-gray horizontal grid (every 0.01) is passed as
# panel.first so it is painted before the curve and stays in the background.
plot(
  K, CV,
  type = "l", col = "black", xlab = "K", ylab = "CV",
  main = "CV error plot - Merged BeeMuSe SeqApiPop 3848 SNPs",
  panel.first = abline(h = seq(0, 1, by = 0.01), col = "lightgray")
)
# Dashed blue horizontal line at the lowest CV value.
abline(h = CV[indice_min], col = "blue", lty = 2)
# Dashed red vertical line at the K giving the lowest CV value.
abline(v = K[indice_min], col = "red", lty = 2)
# Legend reporting the lowest CV error and the corresponding K.
legend(
  "topright",
  legend = sprintf("lowest CV error: %.5f (K = %d)", CV[indice_min], K[indice_min]),
  col = "blue", lty = 2, cex = 0.8
)

629 échantillons - K2 à K9 - 30 exécutions
# Directory holding the per-run .cv.error files for this data set.
setwd("~/Documents/Stage_NB/data/merged_BeeMuSe_SeqApiPop_629_filtered_maf001_LD03")
# Steps 1-2: build the 30 per-run file names and read each file into a list.
# The file names live in their own variable so that `merge_cv_error` only
# ever refers to the merged data frame.
fichiers_cv_error <- paste0("merged_BeeMuSe_SeqApiPop_629_filtered_maf001_LD03_", 1:30, ".cv.error")
liste_de_donnees <- lapply(fichiers_cv_error, read.table, header = FALSE)
# Step 3: stack the per-run tables into a single data frame.
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Step 4: save the merged table without row or column names, then read it
# back; with header = FALSE the columns are named V1, V2, ...
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Box plot of the merged CV errors: column V1 on the x-axis (labelled K),
# column V2 on the y-axis (labelled CV). V1/V2 are the default names given by
# read.table(header = FALSE).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.01; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64,
                   0.65, 0.66, 0.67),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.55, 0.67))

# Box plot with the individual runs overlaid as jittered points: column V1 on
# the x-axis (labelled K), column V2 on the y-axis (labelled CV).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.01; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64,
                   0.65, 0.66, 0.67),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  # Outliers are hidden here because every observation is drawn by the
  # jitter layer anyway.
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  # height = 0: spread the points horizontally only, so each point keeps its
  # exact CV value (geom_jitter also adds vertical noise by default).
  geom_jitter(width = 0.2, height = 0, alpha = 0.7, color = "red") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.55, 0.67))

561 échantillons - K2 à K9 - 30 exécutions
# Directory holding the per-run .cv.error files for this data set.
setwd("~/Documents/Stage_NB/data/merged_data_3848_561_not_supervised")
# Build the 30 per-run file names and read each file into a list. The file
# names live in their own variable so that `merge_cv_error` only ever refers
# to the merged data frame.
fichiers_cv_error <- paste0("merged_BeeMuSe_SeqApiPop_561_filtered_maf001_LD03_", 1:30, ".cv.error")
liste_de_donnees <- lapply(fichiers_cv_error, read.table, header = FALSE)
# Stack the per-run tables into a single data frame.
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Save the merged table without row or column names, then read it back; with
# header = FALSE the columns are named V1, V2, ...
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Box plot of the merged CV errors: column V1 on the x-axis (labelled K),
# column V2 on the y-axis (labelled CV). V1/V2 are the default names given by
# read.table(header = FALSE).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.01; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  # NOTE(review): the 0.54 guide lies below the 0.55 lower limit passed to
  # coord_cartesian(), so it is never visible - confirm the intended range.
  geom_hline(
    yintercept = c(0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63,
                   0.64, 0.65, 0.66, 0.67),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.55, 0.67))

# Box plot with the individual runs overlaid as jittered points: column V1 on
# the x-axis (labelled K), column V2 on the y-axis (labelled CV).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.01; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  # NOTE(review): the 0.54 guide lies below the 0.55 lower limit passed to
  # coord_cartesian(), so it is never visible - confirm the intended range.
  geom_hline(
    yintercept = c(0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63,
                   0.64, 0.65, 0.66, 0.67),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  # Outliers are hidden here because every observation is drawn by the
  # jitter layer anyway.
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  # height = 0: spread the points horizontally only, so each point keeps its
  # exact CV value (geom_jitter also adds vertical noise by default).
  geom_jitter(width = 0.2, height = 0, alpha = 0.7, color = "red") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.55, 0.67))

MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10 bp)
561 échantillons - K2 à K9 - 30 exécutions
# Directory holding the per-run .cv.error files for this data set.
setwd("~/Documents/Stage_NB/data/merged_data_1055_561_not_supervised")
# Build the 30 per-run file names and read each file into a list. The file
# names live in their own variable so that `merge_cv_error` only ever refers
# to the merged data frame.
fichiers_cv_error <- paste0("merged_BeeMuSe_SeqApiPop_561_filtered_MAF001_LD_default_", 1:30, ".cv.error")
liste_de_donnees <- lapply(fichiers_cv_error, read.table, header = FALSE)
# Stack the per-run tables into a single data frame.
donnees_combinees <- do.call(rbind, liste_de_donnees)
# Save the merged table without row or column names, then read it back; with
# header = FALSE the columns are named V1, V2, ...
write.table(donnees_combinees, "merge_cv_error", sep = "\t", col.names = FALSE, row.names = FALSE)
merge_cv_error <- read.table("merge_cv_error", header = FALSE)
# Box plot of the merged CV errors: column V1 on the x-axis (labelled K),
# column V2 on the y-axis (labelled CV). V1/V2 are the default names given by
# read.table(header = FALSE).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.01; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63,
                   0.64, 0.65),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  geom_boxplot(width = 0.5, fill = "yellow", color = "black") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.54, 0.65))

# Box plot with the individual runs overlaid as jittered points: column V1 on
# the x-axis (labelled K), column V2 on the y-axis (labelled CV).
ggplot(merge_cv_error, aes(x = factor(V1), y = V2)) +
  # Horizontal guide lines every 0.01; the theme grid is blanked below.
  # `linewidth` replaces `size`, deprecated for lines since ggplot2 3.4.0.
  geom_hline(
    yintercept = c(0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63,
                   0.64, 0.65),
    color = "black",
    linetype = "solid",
    linewidth = 0.5
  ) +
  # Outliers are hidden here because every observation is drawn by the
  # jitter layer anyway.
  geom_boxplot(width = 0.5, fill = "yellow", color = "black", outlier.shape = NA) +
  # height = 0: spread the points horizontally only, so each point keeps its
  # exact CV value (geom_jitter also adds vertical noise by default).
  geom_jitter(width = 0.2, height = 0, alpha = 0.7, color = "red") +
  labs(
    title = "Cross-validation Error Plot",
    x = "K",
    y = "CV"
  ) +
  # Print the y-axis labels with three decimals.
  scale_y_continuous(labels = scales::number_format(accuracy = 0.001)) +
  theme_minimal() +
  theme(
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the y-axis; coord_cartesian does not drop data outside the window.
  coord_cartesian(ylim = c(0.54, 0.65))
